import re
from functools import lru_cache

import jieba
import pandas as pd

from gq.med_qa.med_const import Med_Const


def load_data(path='./data/medQA.train.csv'):
    """Read a UTF-8 encoded CSV file into a DataFrame.

    Args:
        path: Location of the CSV to read, or any file-like object accepted
            by ``pandas.read_csv``. Defaults to the training file path that
            was previously hard-coded, so existing zero-argument callers
            are unaffected.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    return pd.read_csv(path, encoding='utf-8')


# Matches runs of ASCII and CJK punctuation, whitespace and ASCII digits.
# Both fragments are raw strings: the second one contains ``\s``, which is an
# invalid escape sequence in a normal string literal and triggers a
# DeprecationWarning / SyntaxWarning on modern Python versions.
# Compiled once at import time instead of on every call.
_NOISE_RE = re.compile(
    r"[!\"#$%&'()*+,-./:;<=>?@[\\\]^_`{|}~—！，。？·￥、《》···【】："
    r"''\s0-9]+"
)


def clean_text(text):
    """Strip punctuation, whitespace and ASCII digits from ``text``.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every run of characters matched by the noise pattern
        removed; all other characters (letters, CJK characters, ...) are
        kept in their original order.
    """
    return _NOISE_RE.sub('', text)


def cut_word(text):
    """Segment ``text`` into words with jieba and return them as a list."""
    return jieba.lcut(text)


@lru_cache(maxsize=8)
def _load_stop_words(path):
    """Read the stop-word file at ``path`` and return its stripped lines as a frozenset."""
    # The context manager guarantees the handle is closed; the cache means the
    # file is read once per path instead of once per call.
    with open(path, encoding='utf-8') as handle:
        return frozenset(line.strip() for line in handle)


def remove_stop_words(text, stop_words=None):
    """Drop stop words from a sequence of tokens.

    Args:
        text: Iterable of tokens (e.g. the list produced by ``cut_word``).
        stop_words: Optional container of words to drop. When omitted, the
            words are loaded from ``Med_Const.stop_word_path`` (one word per
            line, UTF-8). That file is read once and cached, so later edits
            to the file are not picked up within the same process.

    Returns:
        A new list holding the tokens of ``text`` that are not stop words,
        in their original order.
    """
    if stop_words is None:
        stop_words = _load_stop_words(Med_Const.stop_word_path)
    return [word for word in text if word not in stop_words]


def process():
    """Load the training data and tokenize its ``question`` column.

    The column is cast to ``str`` and then passed, in order, through
    ``clean_text``, ``cut_word`` and ``remove_stop_words``.

    Returns:
        The DataFrame from ``load_data`` with ``question`` replaced by the
        result of that pipeline.
    """
    frame = load_data()
    questions = frame["question"].astype('str')
    # Apply each preprocessing stage to the output of the previous one.
    for step in (clean_text, cut_word, remove_stop_words):
        questions = questions.apply(step)
    frame['question'] = questions
    return frame


def get_data():
    """Read the train and test CSVs named by ``Med_Const`` and split out their columns.

    Returns:
        A 4-tuple ``(x_train, y_train, x_test, y_test)`` where the ``x``
        items are the ``question`` columns and the ``y`` items are the
        ``label`` columns of the respective files.
    """
    train_df, test_df = (
        pd.read_csv(source, encoding='utf-8')
        for source in (Med_Const.train, Med_Const.test)
    )
    return (
        train_df["question"],
        train_df["label"],
        test_df["question"],
        test_df["label"],
    )


if __name__ == '__main__':
    # Manual smoke check: load the train/test splits and dump them to stdout.
    print(get_data())
